In this notebook, we will perform an EDA (Exploratory Data Analysis) on the processed Waymo dataset (data in the processed folder). In the first part, you will create a function to display images with their corresponding bounding boxes.
from utils import get_dataset
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import patches
import numpy as np
mpl.rcParams['figure.dpi'] = 300  # render inline figures at print resolution
# enable inline matplotlib rendering in Jupyter
%matplotlib inline
# Build dataset handles over the tfrecord shards: the full unsplit pool plus
# the processed train/test/val splits.
# NOTE(review): get_dataset comes from the project's utils module — assumed to
# yield dict entries with 'image', 'groundtruth_classes' and
# 'groundtruth_boxes' keys, as consumed by display_instances below. Confirm.
dataset = get_dataset("data/waymo/training_and_validation/*")
dataset_train = get_dataset("data/output/train/*")
dataset_test = get_dataset("data/output/test/*")
dataset_val = get_dataset("data/output/val/*")
#print(dataset)
#print(type(dataset))
#print(dir(dataset))
Implement the display_instances function below. This function takes a batch as an input and displays an image with its corresponding bounding boxes. The only requirement is that the classes should be color coded (e.g., vehicles in red, pedestrians in blue, cyclists in green).
# Class id -> human-readable label, matching the Waymo label enum
# (0 unknown, 1 vehicle, 2 pedestrian, 3 sign, 4 cyclist).
global_names = ["unknown", "vehicle", "pedestrian", "sign", "cyclist"]
# Running per-class object tally; mutated by add_image() as boxes are drawn.
global_counts = [0] * 5
def display_instances(batch):
    """Display each image in `batch` with its ground-truth bounding boxes.

    Args:
        batch: iterable of dataset entries (dicts with 'image',
            'groundtruth_classes' and 'groundtruth_boxes' keys).

    Side effects: renders one matplotlib figure (one image per row) and
    updates the module-level `global_counts` tally via add_image().
    """
    # Materialize first so the subplot grid matches the actual batch size.
    # (The original hard-coded a 10-row grid, which raised an invalid
    # subplot-index error whenever the batch yielded more than 10 entries.)
    entries = list(batch)
    if not entries:
        return  # nothing to draw
    plt.figure(figsize=(50, 20), dpi=300.0)
    for idx, entry in enumerate(entries, start=1):
        plt.subplot(len(entries), 1, idx)
        add_image(entry)
    plt.tight_layout()
    plt.show()
def add_image(entry):
    """Draw entry['image'] on the current axes with color-coded boxes.

    Boxes are expected in normalized [ymin, xmin, ymax, xmax] order (the
    TF object-detection convention used by the Waymo tooling) and are scaled
    to pixel coordinates per axis. Also increments the module-level
    `global_counts` per-class tally, one increment per box drawn.
    """
    # Color per class id, from the Waymo label enum:
    # TYPE_UNKNOWN=0, TYPE_VEHICLE=1, TYPE_PEDESTRIAN=2,
    # TYPE_SIGN=3, TYPE_CYCLIST=4
    color_map = {
        0: [0, 0, 0],  # black  - unknown
        1: [1, 0, 0],  # red    - vehicle
        2: [0, 1, 0],  # green  - pedestrian
        3: [0, 0, 1],  # blue   - sign
        4: [1, 1, 0],  # yellow - cyclist
    }
    img = entry['image'].numpy()
    height, width = img.shape[0], img.shape[1]
    plt.imshow(img)
    plt.axis("on")
    ax = plt.gca()
    for bbclass, bbox in zip(entry["groundtruth_classes"], entry["groundtruth_boxes"]):
        # Box values are normalized 0-1; scale each axis by its own image
        # dimension. (The original multiplied the whole box by img.shape[0],
        # i.e. scaled x by the image *height*, distorting boxes on
        # non-square frames.)
        y1, x1, y2, x2 = bbox
        x1, x2 = x1 * width, x2 * width
        y1, y2 = y1 * height, y2 * height
        bbclass = int(bbclass)
        global_counts[bbclass] += 1
        rect = patches.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=1,
                                 edgecolor=color_map[bbclass], facecolor='none')
        ax.add_patch(rect)
Using the dataset created in the second cell and the function you just coded, display 10 random images with the associated bounding boxes. You can use the methods take and shuffle on the dataset.
#reset stats
# Zero the per-class tally so the histogram below reflects only this sample.
global_counts = [0] * 5
seed = 1 #testing: fix the seed for reproducible sampling
# Shuffle with a 100-element buffer, then draw 10 random frames to visualize.
sampled_data = dataset.shuffle(100, seed)
display_instances(sampled_data.take(10))
In this last part, you are free to perform any additional analysis of the dataset. What else would like to know about the data? For example, think about data distribution. So far, you have only looked at a single file...
First pass is a quick look at the distribution of types in the currently selected data.
# Bar chart of how many objects of each class appeared in the 10-image sample
# above (global_counts was filled in by display_instances/add_image).
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
ax.bar(global_names, global_counts)
plt.show()
Grab a larger data set and count object numbers across a big selection. When we break the samples out into splits, we want each split to show a fairly similar breakdown.
We will gather the total count of each type and the average size of each type.
Future statistics could include average overlaps, aspect ratios and their distribution, size distributions, etc. For now we keep the stats simple.
def area(bbox):
    """Return the fraction of the image (0-1) covered by a normalized box.

    `bbox` is [ymin, xmin, ymax, xmax] in ratio units.
    """
    ymin, xmin, ymax, xmax = bbox
    return (xmax - xmin) * (ymax - ymin)
def calc_stats(batch):
    """Aggregate per-class statistics over a batch of dataset entries.

    Args:
        batch: iterable of entries with 'groundtruth_classes' and
            'groundtruth_boxes' (boxes normalized [ymin, xmin, ymax, xmax]).

    Returns:
        dict with:
            counts:     list of per-class object counts (index = class id)
            avg_area:   np.ndarray, mean normalized box area per class
                        (0 for classes never seen)
            total_obj:  total number of boxes seen
            type_ratio: np.ndarray, each class's share of all boxes
                        (all zeros for an empty batch)
    """
    count_type = [0] * 5
    total_area_type = [0.0] * 5
    total_obj = 0
    for entry in batch:
        for bbclass, bbox in zip(entry["groundtruth_classes"], entry["groundtruth_boxes"]):
            bbclass = int(bbclass)
            count_type[bbclass] += 1
            # Normalized [ymin, xmin, ymax, xmax] -> fractional area covered.
            y1, x1, y2, x2 = bbox
            total_area_type[bbclass] += (x2 - x1) * (y2 - y1)
            total_obj += 1
    counts = np.array(count_type)
    areas = np.array(total_area_type)
    # where= avoids 0/0 for classes that never appeared.
    avg_area_type = np.divide(areas, counts, out=np.zeros_like(areas), where=counts != 0)
    # Guard the empty-batch case (the original divided by zero here).
    type_ratio = counts / total_obj if total_obj else np.zeros(5)
    return {"counts": count_type, "avg_area": avg_area_type, "total_obj": total_obj, "type_ratio": type_ratio}
# warning: this cell takes a while; once the data is generated just rerun the
# next cell to regraph.
# Around 6k data images
seed = 1 #testing: fix the seed for reproducible sampling
# Stats over (up to) 10k shuffled frames of the full, unsplit dataset.
stats = calc_stats(dataset.shuffle(10000, seed).take(10000))
#stats = calc_stats(dataset)
#print(stats)
# broken up into 4 steps due to Firefox crashing in the VM a lot
seed = 1 #testing: fix the seed for reproducible sampling
# Same statistics per split, so the distributions can be compared below.
stats_train = calc_stats(dataset_train.shuffle(10000, seed).take(10000))
stats_test = calc_stats(dataset_test.shuffle(10000, seed).take(10000))
stats_val = calc_stats(dataset_val.shuffle(10000, seed).take(10000))
# Two stacked charts for the full-dataset sample: per-class share of all
# objects (top) and mean normalized box area per class (bottom).
fig, axs = plt.subplots(2, 1, constrained_layout=True)
print(stats)
ax = axs[0]
ax.bar(global_names, stats["type_ratio"])
ax.set_title("Ratio of types")
# NOTE(review): "Vechile" typo is in the printed output string; left as-is.
print(f"Vechile Ratio {stats['type_ratio'][1]}")
print(f"Pedestrian Ratio {stats['type_ratio'][2]}")
print(f"cyclist Ratio {stats['type_ratio'][4]}")
print()
ax = axs[1]
ax.set_title("Average Area of Types")
ax.bar(global_names, stats["avg_area"])
plt.show()
Now graph the distribution of the splits. They should be similar across all 4 groups of data. If any is out of family, then our data split may not have been done well.
# Grouped bar charts comparing the full sample against the train/test/val
# splits: per-class ratio (top) and mean normalized box area (bottom).
# Each series is offset by 0.2 around its class tick.
fig, axs = plt.subplots(2, 1, constrained_layout=True)
X_axis = np.arange(5)  # one tick per class id

ax = axs[0]
ax.set_title("Ratio of types")
ax.bar(X_axis - 0.4, stats["type_ratio"], 0.2, label="Full Data")
ax.bar(X_axis - 0.2, stats_train["type_ratio"], 0.2, label="Training")
ax.bar(X_axis, stats_test["type_ratio"], 0.2, label="Test")
ax.bar(X_axis + 0.2, stats_val["type_ratio"], 0.2, label="Validation")
# Label ticks on this axes explicitly; the original plt.xticks() call acted
# on the *current* axes (the bottom subplot), leaving this chart unlabeled.
ax.set_xticks(X_axis)
ax.set_xticklabels(global_names)
# Fix: the label= kwargs above have no visible effect without a legend.
ax.legend()

ax = axs[1]
ax.set_title("Average Area of Types")
ax.bar(X_axis - 0.4, stats["avg_area"], 0.2, label="Full Data")
ax.bar(X_axis - 0.2, stats_train["avg_area"], 0.2, label="Training")
ax.bar(X_axis, stats_test["avg_area"], 0.2, label="Test")
ax.bar(X_axis + 0.2, stats_val["avg_area"], 0.2, label="Validation")
ax.set_xticks(X_axis)
ax.set_xticklabels(global_names)
ax.legend()
plt.show()
The takeaway from this is that the validation data tends to contain much larger views of pedestrians. This may pose issues, so it will be an area to watch.
The distribution of types across the data sets is fairly close.